library(magrittr) # quando der problema com o "%>%"
library(dplyr) # selecao e filtro de dados
library(geosphere) # localizacao geoespacial
library(lubridate) # datas, funcoes hour, month, wday
library(plotly) # plot dos graficos
library(knitr) # usada pelo plotly
library(dummies) # cria colunas binarias para variaveis categoricas
library(scales) # normaliza dados rescalando para float de 0 a 1
library(randomForest) # cria rede neural para criar regressao de tempo de viagem
library(tidyverse)
library(yaml)
source('preprocessing.R')
source('mapa_calor_ny.R')
read.csv(gzfile("./data_source/train.csv.gz")) %>%
as.data.frame() -> train
head(train, 3)
## id vendor_id pickup_datetime dropoff_datetime
## 1 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30
## 2 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38
## 3 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48
## passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1 1 -73.98215 40.76794 -73.96463
## 2 1 -73.98042 40.73856 -73.99948
## 3 1 -73.97903 40.76394 -74.00533
## dropoff_latitude store_and_fwd_flag trip_duration
## 1 40.76560 N 455
## 2 40.73115 N 663
## 3 40.71009 N 2124
sum(is.na(train))
## [1] 0
summary(train)
## id vendor_id pickup_datetime
## id0000001: 1 Min. :1.000 2016-01-12 18:48:44: 5
## id0000003: 1 1st Qu.:1.000 2016-02-09 21:03:38: 5
## id0000005: 1 Median :2.000 2016-03-04 08:07:34: 5
## id0000008: 1 Mean :1.535 2016-04-05 18:55:21: 5
## id0000009: 1 3rd Qu.:2.000 2016-05-07 13:18:07: 5
## id0000011: 1 Max. :2.000 2016-06-10 23:17:17: 5
## (Other) :1458638 (Other) :1458614
## dropoff_datetime passenger_count pickup_longitude
## 2016-02-19 19:25:04: 5 Min. :0.000 Min. :-121.93
## 2016-05-16 19:40:28: 5 1st Qu.:1.000 1st Qu.: -73.99
## 2016-01-07 08:04:32: 4 Median :1.000 Median : -73.98
## 2016-01-08 12:43:38: 4 Mean :1.665 Mean : -73.97
## 2016-01-08 13:00:41: 4 3rd Qu.:2.000 3rd Qu.: -73.97
## 2016-01-09 15:59:42: 4 Max. :9.000 Max. : -61.34
## (Other) :1458618
## pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag
## Min. :34.36 Min. :-121.93 Min. :32.18 N:1450599
## 1st Qu.:40.74 1st Qu.: -73.99 1st Qu.:40.74 Y: 8045
## Median :40.75 Median : -73.98 Median :40.75
## Mean :40.75 Mean : -73.97 Mean :40.75
## 3rd Qu.:40.77 3rd Qu.: -73.96 3rd Qu.:40.77
## Max. :51.88 Max. : -61.34 Max. :43.92
##
## trip_duration
## Min. : 1
## 1st Qu.: 397
## Median : 662
## Mean : 959
## 3rd Qu.: 1075
## Max. :3526282
##
set.seed(20)
# Criando uma sequencia de 1 ate a quantidade de total de linhas
linhas.idx <- seq_len(nrow(train))
# Obtendo aleatoriamente 10000 amostras de linhas do dataset
linhas.sample <- sample(linhas.idx, 5000)
# Amostra aleatoria com 10000 dados
df <- train[linhas.sample, ]
head(df, 10)
## id vendor_id pickup_datetime dropoff_datetime
## 1279992 id1222043 1 2016-04-23 14:30:31 2016-04-23 14:46:12
## 1121016 id1532438 1 2016-06-29 13:56:47 2016-06-29 14:37:48
## 406908 id2622202 2 2016-06-29 19:06:05 2016-06-29 19:19:51
## 771860 id1738519 1 2016-02-16 22:46:58 2016-02-16 23:32:02
## 1404535 id1217189 2 2016-05-03 16:29:38 2016-05-03 16:32:45
## 1429984 id3391016 2 2016-05-08 20:46:41 2016-05-08 20:52:50
## 133222 id2918068 2 2016-05-07 21:25:28 2016-05-07 21:44:40
## 103198 id1453370 2 2016-02-13 15:58:02 2016-02-13 16:12:25
## 477841 id0952614 1 2016-03-24 20:06:00 2016-03-24 20:29:05
## 539804 id2150157 2 2016-06-14 10:46:55 2016-06-14 10:57:38
## passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1279992 1 -74.00172 40.73539 -73.99198
## 1121016 2 -73.86301 40.76910 -73.98652
## 406908 5 -73.94592 40.78599 -73.97225
## 771860 1 -73.87315 40.77406 -73.68898
## 1404535 1 -73.94961 40.77686 -73.94212
## 1429984 1 -73.98979 40.74349 -73.99090
## 133222 2 -73.95290 40.77622 -73.99262
## 103198 2 -73.97441 40.74239 -73.99603
## 477841 1 -73.86289 40.76910 -73.95470
## 539804 1 -73.95792 40.76106 -73.94463
## dropoff_latitude store_and_fwd_flag trip_duration
## 1279992 40.74020 N 941
## 1121016 40.75035 N 2461
## 406908 40.79433 N 826
## 771860 41.04483 N 2704
## 1404535 40.78674 N 187
## 1429984 40.75052 N 369
## 133222 40.75843 N 1152
## 103198 40.73256 N 863
## 477841 40.77395 N 1385
## 539804 40.80746 N 643
df$bairro_saida = mapply(define_bairro, df$pickup_longitude, df$pickup_latitude)
df$bairro_chegada = mapply(define_bairro, df$dropoff_longitude, df$dropoff_latitude)
#Distancia em KM
df$dist_euclidiana = dist_eucl(df)
df$dist_manhattan = dist_manh(df)
df$velocidade = df$dist_manhattan / df$trip_duration
df$pickup_hour <- hour(df$pickup_datetime)
df$pickup_month <- month(df$pickup_datetime)
df$pickup_weekdays <- wday(df$pickup_datetime)
df$sen_hour <- sin(df$pickup_hour / 3.4)
df$sen_month <- sin(df$pickup_month / 1.7)
df$sen_week <- sin(df$pickup_weekdays)
plot( sin(seq(from = 1, to = 24/3.4, length.out = 70 )), type = 'o' )
df %>%
filter(df$dist_manhattan > 0.5) -> df
df %>%
filter(df$trip_duration < 10000) -> df
Regioes NY
df %>%
group_by(bairro_saida) %>%
count() -> data_plot
plot1 = plot_ly(data= data_plot, x= ~bairro_saida, y= ~n, type = 'bar')
df %>%
group_by(bairro_chegada) %>%
count() -> data_plot
plot2 = plot_ly(data= data_plot, x= ~bairro_chegada, y= ~n, type = 'bar')
subplot(plot1, plot2, shareY = T)
df %>%
group_by(bairro_saida) %>%
summarize(velocidade_media = mean(velocidade),n()) -> data_plot
plot1 = plot_ly(data= data_plot, x= ~bairro_saida, y= ~velocidade_media, type = 'bar')
df %>%
group_by(bairro_chegada) %>%
summarize(velocidade_media = mean(velocidade),n()) -> data_plot
plot2 = plot_ly(data= data_plot, x= ~bairro_chegada, y= ~velocidade_media, type = 'bar')
subplot(plot1, plot2, shareY = T)
p1 = plot_ly(data= df, x= ~passenger_count, y= ~trip_duration, type = 'scatter', mode = 'markers')
p2 = plot_ly(data= df, x= ~dist_manhattan, y= ~trip_duration, type = 'scatter', mode = 'markers') %>%
layout(title="Correlacao Num. Passageiros vs. Tempo | Correlacao Distancia vs. Tempo")
subplot(p1, p2)
df %>%
group_by(pickup_hour) %>%
summarize(velocidade_media = mean(velocidade),n()) -> data_plot1
plot1 = plot_ly(data= data_plot1, x= ~pickup_hour, y= ~velocidade_media, type = 'scatter', mode='lines')
df %>%
group_by(pickup_weekdays) %>%
summarize(velocidade_media = mean(velocidade),n()) -> data_plot2
plot2 = plot_ly(data= data_plot2, x= ~pickup_weekdays, y= ~velocidade_media, type = 'scatter', mode='lines') %>%
layout(title="Horas | Dias da Semana")
subplot(plot1, plot2, shareY = T)
df %>%
group_by(pickup_hour) %>%
count() -> data_plot1
plot1 = plot_ly(data= data_plot1, x= ~pickup_hour, y= ~n, type = 'bar')
df %>%
group_by(pickup_weekdays) %>%
count() -> data_plot2
plot2 = plot_ly(data= data_plot2, x= ~pickup_weekdays, y= ~n, type = 'bar') %>%
layout(title="Horas | Dias da Semana")
subplot(plot1, plot2)
heat_map_taxi(train, "pickup")
## OGR data source with driver: ESRI Shapefile
## Source: "D:\GITHUB\analise_taxi\data_source\mapa_ny", layer: "geo_export_8661594b-4f67-485f-8af1-84a4bd06054d"
## with 5 features
## It has 4 fields
heat_map_taxi(train, "dropoff")
## OGR data source with driver: ESRI Shapefile
## Source: "D:\GITHUB\analise_taxi\data_source\mapa_ny", layer: "geo_export_8661594b-4f67-485f-8af1-84a4bd06054d"
## with 5 features
## It has 4 fields
kmeans_data <- df[, c("trip_duration", "dist_euclidiana")]
boxplot(kmeans_data[, c("trip_duration")], las=1, xlab="trip_duration")
boxplot(kmeans_data[, c("dist_euclidiana")], las=1, xlab=c("dist_euclidiana"))
normalized <-(kmeans_data-min(kmeans_data))/(max(kmeans_data)-min(kmeans_data))
clusters <- kmeans(normalized, centers = 3)
plot(normalized, col=clusters$cluster, pch=21, cex=1)
df %>%
mutate(pickup_time_in_minutes = minute(pickup_datetime) + hour(pickup_datetime) * 60) %>%
mutate(hour_quarter = pickup_time_in_minutes %/% 15) -> df
head(df, 3)
## id vendor_id pickup_datetime dropoff_datetime
## 1 id1222043 1 2016-04-23 14:30:31 2016-04-23 14:46:12
## 2 id1532438 1 2016-06-29 13:56:47 2016-06-29 14:37:48
## 3 id2622202 2 2016-06-29 19:06:05 2016-06-29 19:19:51
## passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1 1 -74.00172 40.73539 -73.99198
## 2 2 -73.86301 40.76910 -73.98652
## 3 5 -73.94592 40.78599 -73.97225
## dropoff_latitude store_and_fwd_flag trip_duration bairro_saida
## 1 40.74020 N 941 5
## 2 40.75035 N 2461 8
## 3 40.79433 N 826 4
## bairro_chegada dist_euclidiana dist_manhattan velocidade pickup_hour
## 1 5 0.9800723 1.356218 0.001441252 14
## 2 5 10.6208499 12.499100 0.005078870 13
## 3 3 2.4052090 3.147032 0.003809967 19
## pickup_month pickup_weekdays sen_hour sen_month sen_week
## 1 4 7 -0.8282931 0.7094035 0.6569866
## 2 6 4 -0.6302978 -0.3781704 -0.7568025
## 3 6 4 -0.6403470 -0.3781704 -0.7568025
## pickup_time_in_minutes hour_quarter
## 1 870 58
## 2 836 55
## 3 1146 76
df %>%
group_by(hour_quarter) %>%
summarise(count = n()) -> hour_quarter_freq
plot(hour_quarter_freq, type = "o", main="Grafico de linha temporal por quartos de hora", xlab="Quarto de hora", ylab="Numero de viagens")
df %>%
group_by(pickup_month) %>%
summarise(count = n()) -> month_freq
plot(month_freq, type = "o", main="Grafico de linha temporal mensal", xlab="Mês", ylab="Número de viagens")
df %>%
group_by(pickup_weekdays) %>%
summarise(count = n()) -> weekday_freq
plot(weekday_freq, type = "o", main="Gráfico de linha temporal por dia da semana", xlab="Dia da semana", ylab="Número de viagens")
bairro_dummy = dummy(df$bairro_chegada, sep='_')
df = data.frame(cbind(df, bairro_dummy))
df$dist_manhattan = rescale(df$dist_manhattan)
df$trip_duration = rescale(df$trip_duration)
df$passenger_count = rescale(df$trip_duration)